Outline

There are more and more Bioconductor packages supporting single-cell data analysis. R Amezquita, A Lun, S Hicks, and R Gottardo wrote an integrated workflow, Orchestrating Single-Cell Analysis with Bioconductor, for single-cell data analysis and quality assessment. In this session, we will go through several important QC metrics which can’t be made with Seurat.

  • How to differentiate empty droplets?
  • How to estimate ambient RNA and remove them?
  • How to identify doublets?
  • Checking for confounded effect?

Loading data and empty drops


Load data from Cell Ranger Result

We can load the data in from 10X with the DropletUtils function, read10xCounts().

library(DropletUtils)
library(DropletTestFiles)
fname <- "~/filepath/toCellRanger/results"
sce <- read10xCounts(fname, col.names = TRUE)
cellID <- colData(sce)$Barcode

Subset dataset

We have a small subset version of this dataset that you can load in from the data/ directory to try this out.

## class: SingleCellExperiment 
## dim: 27998 100000 
## metadata(1): Samples
## assays(1): counts
## rownames(27998): ENSMUSG00000051951 ENSMUSG00000089699 ...
##   ENSMUSG00000096730 ENSMUSG00000095742
## rowData names(2): ID Symbol
## colnames(100000): CGTGTCTTCGCATGAT-1 TTTATGCGTCGAACAG-1 ...
##   TTATGCTGTGTTTGTG-1 GCTGCGACACGTTGGC-1
## colData names(2): Sample Barcode
## reducedDimNames(0):
## altExpNames(0):

SingleCellExperiment Object

read10xCounts() reads the data in as a specialist object called a SingleCellExperiment.

sce
## class: SingleCellExperiment 
## dim: 27998 100000 
## metadata(1): Samples
## assays(1): counts
## rownames(27998): ENSMUSG00000051951 ENSMUSG00000089699 ...
##   ENSMUSG00000096730 ENSMUSG00000095742
## rowData names(2): ID Symbol
## colnames(100000): CGTGTCTTCGCATGAT-1 TTTATGCGTCGAACAG-1 ...
##   TTATGCTGTGTTTGTG-1 GCTGCGACACGTTGGC-1
## colData names(2): Sample Barcode
## reducedDimNames(0):
## altExpNames(0):

SingleCellExperiment Object

The colData() and rowData() functions can be used to access experiment metadata.

# cell information
colData(sce)[1:2, ]
## DataFrame with 2 rows and 2 columns
##                                    Sample            Barcode
##                               <character>        <character>
## CGTGTCTTCGCATGAT-1 ~/Desktop/01_scSeq_t.. CGTGTCTTCGCATGAT-1
## TTTATGCGTCGAACAG-1 ~/Desktop/01_scSeq_t.. TTTATGCGTCGAACAG-1
# gene information
rowData(sce)[1:2, ]
## DataFrame with 2 rows and 2 columns
##                                    ID      Symbol
##                           <character> <character>
## ENSMUSG00000051951 ENSMUSG00000051951        Xkr4
## ENSMUSG00000089699 ENSMUSG00000089699      Gm1992

Access UMI counts in each droplet

bcrank <- barcodeRanks(counts(sce))
bcrank[1:2, ]
## DataFrame with 2 rows and 3 columns
##                         rank     total    fitted
##                    <numeric> <numeric> <numeric>
## CGTGTCTTCGCATGAT-1   41200.5         1        NA
## TTTATGCGTCGAACAG-1   41200.5         1        NA

Knee plot

uniq <- !duplicated(bcrank$rank)
# 
plot(bcrank$rank[uniq], bcrank$total[uniq], log = "xy", xlab = "Rank", ylab = "Total UMI count", 
    cex.lab = 1.2)

abline(h = metadata(bcrank)$inflection, col = "darkgreen", lty = 2)
abline(h = metadata(bcrank)$knee, col = "dodgerblue", lty = 2)

legend("bottomleft", legend = c("Inflection", "Knee"), col = c("darkgreen", "dodgerblue"), 
    lty = 2, cex = 1.2)

Knee Plot

## Warning in xy.coords(x, y, xlabel, ylabel, log): 1 y value <= 0 omitted from
## logarithmic plot

Identify non-empty droplets

The emptydrops function can be used to identify goog/bad droplets. 2 keys arguments are: - limit: lowest counts per cell - test.ambient: Could be used to estimate ambient RNA contamination

set.seed(100)
limit <- 100
e.out <- emptyDrops(counts(sce), lower = limit, test.ambient = TRUE)
# 
e.out
## DataFrame with 100000 rows and 5 columns
##                        Total     LogProb     PValue   Limited        FDR
##                    <integer>   <numeric>  <numeric> <logical>  <numeric>
## CGTGTCTTCGCATGAT-1         1    -5.34170 0.70912909     FALSE 0.95697174
## TTTATGCGTCGAACAG-1         1    -6.44503 0.50414959     FALSE 0.92603663
## CATGACACAAGTCTGT-1         0          NA         NA        NA         NA
## ACATACGCACCACCAG-1        58  -266.20222 0.01839816     FALSE 0.45640979
## CGTCCATAGCTGCAAG-1      1601 -2917.18710 0.00009999      TRUE 0.00507393
## ...                      ...         ...        ...       ...        ...
## GTGCTTCGTCACCCAG-1         0          NA         NA        NA         NA
## GGAAAGCCAAGGCTCC-1         1    -9.06371 0.20007999     FALSE   0.818381
## TGCCCTAGTAGCTCCG-1         1   -13.51757 0.00309969     FALSE   0.125927
## TTATGCTGTGTTTGTG-1         0          NA         NA        NA         NA
## GCTGCGACACGTTGGC-1         0          NA         NA        NA         NA

Identify non-empty droplets

# Testeed by FDR
summary(e.out$FDR <= 0.001)
##    Mode   FALSE    TRUE    NA's 
## logical   54010     591   45399
# Concordance by testing with FDR and limited
table(Sig = e.out$FDR <= 0.001, Limited = e.out$Limited)
##        Limited
## Sig     FALSE  TRUE
##   FALSE 53525   485
##   TRUE      1   590

Distribution of non-empty reads

We can plot the distribution of significance of non-empty reads

hist(e.out$PValue[e.out$Total <= limit & e.out$Total > 0], xlab = "P-value", main = "", 
    col = "grey80")

Subset non-empty droplets

We can filter to just our non-empty droplets using a simple which query on the FDR from emptyDrops(). Here we are using a 0.001 cut-off.

sce2 <- sce[, which(e.out$FDR <= 0.001)]

Normalization and clustering


Counts normalization

library(scran)
library(scuttle)
library(scater)
clusters <- quickCluster(sce2)
sce2 <- computeSumFactors(sce2, cluster = clusters)
sce2 <- logNormCounts(sce2)
sce2
## class: SingleCellExperiment 
## dim: 27998 591 
## metadata(1): Samples
## assays(2): counts logcounts
## rownames(27998): ENSMUSG00000051951 ENSMUSG00000089699 ...
##   ENSMUSG00000096730 ENSMUSG00000095742
## rowData names(2): ID Symbol
## colnames(591): GCTCCTAAGGCACATG-1 GTAACTGTCGGATGGA-1 ...
##   CACCAGGTCACCTTAT-1 AGGTCATCATGTAAGA-1
## colData names(3): Sample Barcode sizeFactor
## reducedDimNames(0):
## altExpNames(0):

Identify variable features

set.seed(1000)
# modeling variables
dec.pbmc <- modelGeneVarByPoisson(sce2)
# calcualte top features
top.pbmc <- getTopHVGs(dec.pbmc, prop = 0.1)

Make TSNE and UMAP plots

set.seed(1000)
# Evaluate PCs
sce2 <- denoisePCA(sce2, subset.row = top.pbmc, technical = dec.pbmc)
# make TSNE plot
sce2 <- runTSNE(sce2, dimred = "PCA")
# make UMAP plot
sce2 <- runUMAP(sce2, dimred = "PCA")

Graphic based clustering

g <- buildSNNGraph(sce2, k = 10, use.dimred = "PCA")
clust <- igraph::cluster_walktrap(g)$membership
colLabels(sce2) <- factor(clust)
# 
colData(sce2)
## DataFrame with 591 rows and 4 columns
##                                    Sample            Barcode sizeFactor
##                               <character>        <character>  <numeric>
## GCTCCTAAGGCACATG-1 ~/Desktop/01_scSeq_t.. GCTCCTAAGGCACATG-1   0.913482
## GTAACTGTCGGATGGA-1 ~/Desktop/01_scSeq_t.. GTAACTGTCGGATGGA-1   0.970723
## CGGACACCATTACGAC-1 ~/Desktop/01_scSeq_t.. CGGACACCATTACGAC-1   0.725295
## GCGAGAAAGCTACCGC-1 ~/Desktop/01_scSeq_t.. GCGAGAAAGCTACCGC-1   0.700158
## AAACGGGCAGATGGGT-1 ~/Desktop/01_scSeq_t.. AAACGGGCAGATGGGT-1   1.636871
## ...                                   ...                ...        ...
## GTGGGTCAGCACCGTC-1 ~/Desktop/01_scSeq_t.. GTGGGTCAGCACCGTC-1   1.026750
## CGCTATCAGTACGACG-1 ~/Desktop/01_scSeq_t.. CGCTATCAGTACGACG-1   0.677916
## GTACTTTTCCTTTCGG-1 ~/Desktop/01_scSeq_t.. GTACTTTTCCTTTCGG-1   0.978623
## CACCAGGTCACCTTAT-1 ~/Desktop/01_scSeq_t.. CACCAGGTCACCTTAT-1   1.130759
## AGGTCATCATGTAAGA-1 ~/Desktop/01_scSeq_t.. AGGTCATCATGTAAGA-1   1.689680
##                       label
##                    <factor>
## GCTCCTAAGGCACATG-1        2
## GTAACTGTCGGATGGA-1        2
## CGGACACCATTACGAC-1        2
## GCGAGAAAGCTACCGC-1        6
## AAACGGGCAGATGGGT-1        4
## ...                     ...
## GTGGGTCAGCACCGTC-1        6
## CGCTATCAGTACGACG-1        6
## GTACTTTTCCTTTCGG-1        1
## CACCAGGTCACCTTAT-1        7
## AGGTCATCATGTAAGA-1        4

UMAP plot

plotUMAP(sce2, colour_by = "label")

tSNE plot

plotTSNE(sce2, colour_by = "label")

Removing Ambient RNA


Ambient RNA

  • Cell-free RNAs can contaminate droplets.
  • They can be estimated by empty droplets.
# extrat potential ambient RNA and thee estimated score
amb <- metadata(e.out)$ambient[, 1]
head(amb)
## ENSMUSG00000051951 ENSMUSG00000025902 ENSMUSG00000033845 ENSMUSG00000025903 
##       5.808442e-07       5.808442e-07       1.067188e-04       6.783426e-05 
## ENSMUSG00000033813 ENSMUSG00000002459 
##       5.876210e-05       5.808442e-07

Remove ambient RNA

library(scater)
stripped <- sce2[names(amb), ]
out <- removeAmbience(counts(stripped), ambient = amb, groups = colLabels(stripped))

Integrate corrected counts

counts(stripped, withDimnames = FALSE) <- out
stripped <- logNormCounts(stripped)

Before/After removal

  • Hemoglobin A1 (Hba-a1) as example
  • In most cases the Hbs are contaminated from residual RBCs
ensmbl_id <- rowData(sce2)$ID[rowData(sce2)$Symbol == "Hba-a1"]
plotExpression(sce2, x = "label", colour_by = "label", features = ensmbl_id) + ggtitle("Before")

plotExpression(stripped, x = "label", colour_by = "label", features = ensmbl_id) + 
    ggtitle("After")

Before/After removal

  • Hemoglobin A1 (Hba-a1) as example

Before/After removal

  • Krt17 as example
ensmbl_id <- rowData(sce2)$ID[rowData(sce2)$Symbol == "Krt17"]
plotExpression(sce2, x = "label", colour_by = "label", features = ensmbl_id) + ggtitle("Before")
plotExpression(stripped, x = "label", colour_by = "label", features = ensmbl_id) + 
    ggtitle("After")

Before/After removal

  • Krt17 as example

Save result

Again we can save our results for later using a Rds object.

saveRDS(stripped, "scSeq_CTRL_sceSub_rmAmbRNA.rds")

Removing Doublets


Doublets

  • Doublets means two or more cells clumped in a single droplet. Thus, the read counts and genes detected in this droplet would be much higher than other droplets.
  • Here, we demonstrate how to identify doublets using simulation with scran

Normalization and clustering

dec <- modelGeneVar(stripped)
hvgs <- getTopHVGs(dec, n = 1000)
stripped <- runPCA(stripped, ncomponents = 10, subset_row = hvgs)
stripped <- runUMAP(stripped, dimred = "PCA")
g <- buildSNNGraph(stripped, k = 10, use.dimred = "PCA")
clust <- igraph::cluster_walktrap(g)$membership
colLabels(stripped) <- factor(clust)
plotUMAP(stripped, colour_by = "label")

Normalization and clustering

Estimate doublets

The computeDoubletDensity() fucntion can be used on your SingleCellExperiment object to estimate doublets.

dbl.dens <- computeDoubletDensity(stripped, #subset.row=top.mam, 
    d=ncol(reducedDim(stripped)),subset.row=hvgs)
summary(dbl.dens)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.2908  0.7978  1.1442  1.2442  1.6010  4.1961
stripped$DoubletScore <- dbl.dens

Plot doublets scores ~ UMAP

We can project the doublet scores onto our UMPA to see if doublet correlates with the data distribution.

plotUMAP(stripped, colour_by = "DoubletScore")

Plot doublets scores by cluster

Likewise we can look for over representation of doublets in clusters.

plotColData(stripped, x = "label", y = "DoubletScore", colour_by = "label") + geom_hline(yintercept = quantile(colData(stripped)$DoubletScore, 
    0.95), lty = "dashed", color = "red")

- No clusters have significantly higher doublet scores than other clusters. No clusters would be removed. - Red dash line represented 95% quantile of doublet score. The cells with higher doublet score than this cut-off would be removed.

Remove doublets

We can clean our data up based on this 95% quantile cut-off.

cut_off <- quantile(stripped$DoubletScore, 0.95)
stripped$isDoublet <- c("no", "yes")[factor(as.integer(stripped$DoubletScore >= cut_off), 
    levels = c(0, 1))]
table(stripped$isDoublet)
## 
##  no yes 
## 561  30
sce_clean <- stripped[stripped$isDoublet == "no", ]
saveRDS(sce_clean, "scSeq_CTRL_sceSub_rmAmbRNA_rmDoublet.rds")

QC plots after clearance


QC plots

  • Mitochondrial content
  • Genes detected
  • Reads per cell
library(scater)
mtGene <- rowData(sce_clean)$ID[grepl(rowData(sce_clean)$Symbol, pattern = "mt-")]
is.mito <- names(sce_clean) %in% mtGene
sce_clean <- addPerCellQC(sce_clean, subsets = list(Mito = is.mito))

QC plots - Read counts

Post-clean

plotColData(sce_clean, x = "label", y = "sum", colour_by = "label") + ggtitle("read counts")

QC plots - Gene counts

Post-clean

plotColData(sce_clean, x = "label", y = "detected", colour_by = "label") + ggtitle("gene counts")

QC plots - Mitochondrial %

Post-clean

plotColData(sce_clean, x = "label", y = "subsets_Mito_percent", colour_by = "label") + 
    ggtitle("mitocondrial content")

QC plots ~ comparison

  • Mitochondrial contents vs read counts
plotColData(sce_clean, x = "sum", y = "subsets_Mito_percent", colour_by = "label") + 
    ggtitle("is.mito vs read counts")

QC plots ~ comparison

  • Gene counts vs read counts
plotColData(sce_clean, x = "sum", y = "detected", colour_by = "label") + ggtitle("gene counts vs read counts")

Estimate variance explaination

  • Clustering (label), mitochondrial content (subsets_Mito_percent), doublets (DoubletScore), read counts (sum), and gene counts (detected) were tested.
  • We would suppose “label” (clustering) would explain more variances than other controls.
vars <- getVarianceExplained(sce_clean, variables = c("DoubletScore", "label", "sum", 
    "detected", "subsets_Mito_percent"))
plotExplanatoryVariables(vars)

Estimate variance explaination

plotExplanatoryVariables(vars)
## Warning: Removed 2765 rows containing non-finite values (stat_density).

Exercise Time